Github: https://github.com/vickynugget/DSI_DataChallenge5.git
# load library
library(tidyverse)
library(janitor)
library(ggplot2)
library(GGally)
library(plotly)
# read data
nndb_flat <- read.csv('nndb_flat.csv')
nndb_flat <- nndb_flat %>%
# filter the data to contain only specific food groups
filter(FoodGroup == 'Vegetables and Vegetable Products' |
FoodGroup == 'Beef Products' |
FoodGroup == 'Sweets')
nndb_data <- nndb_flat %>%
# select only the variables from Energy_kcal to Zinc_mg
select(Energy_kcal:Zinc_mg)
# examine the correlation among the variables
ggcorr(nndb_data,
hjust = 1,
size = 2,
layout.exp = 2) # adjust label
Manganese and Vitamin A, Thiamin and Folate have the highest correlations and they are positive correlated. Zinc and protein, fat and energy, sugar and carb, Thiamin and Riboflavin also have high positive correlations.
# perform PCA and scale the data
pca_nndb <- prcomp(nndb_data, center = TRUE, scale. = TRUE)
# Calculate the Cumulative proportion of the variation explained by each PC
var <- pca_nndb$sdev^2
cum <- cumsum(var/sum(var))
# Format into a data frame
var_explained_df <- data.frame(PC = paste0("PC",1:23),
var_explained = cum)
# format the order of PCs
var_explained_df$PC <- factor(var_explained_df$PC, levels = var_explained_df$PC)
var_explained_df %>%
# initialize ggplot with PC on the x-axis and cumulative variation explained on the y-axis
ggplot(aes(x = PC, y = var_explained, group = 1)) +
geom_point() + # add points to the plot
geom_line() + # add lines to the plot
labs(title = "Cumulative proportion of the variation explained", # add title
y = 'Cumulated Variance Explained', # add y-axis label
x = 'PC') + # add x-axis label
theme(axis.text.x=element_text(angle = 40, hjust = 1)) # rotate the x labels
# get loadings for the first 3 PCs which explain about 60% of the variation in the data
pca_nndb_loadings <- as.data.frame(pca_nndb$rotation) %>% # get the loadings
select(PC1, PC2, PC3) %>% # select first 3 PCs
mutate(variable = rownames(pca_nndb$rotation)) %>% # add variable names
pivot_longer(cols = c('PC1', 'PC2', 'PC3'), # transfer the data into longer format
names_to = 'PC',
values_to = 'loadings')
# make 3 separate plots for the loadings for the first 3 PCs for all of the variables
for (i in c('PC1', 'PC2', 'PC3')){
plot <- pca_nndb_loadings %>%
filter(PC == i) %>% # filter out the loadings for specific PC
ggplot(aes(x = reorder(variable, abs(loadings)), y = loadings)) + # initialize ggplot
geom_bar(stat = 'identity') + # make a bar plot
labs(title = paste("Loadings for", i), # add title
y = 'Loadings', # add y-axis label
x = 'Variable') + # add x-axis label
theme(axis.text.x = element_text(angle=40, hjust=1)) # rotate the x labels
# output the plot
print(plot)
}
# create a data frame of scores
pca_scores <- as.data.frame(pca_nndb$x) %>% # get the scores
mutate(FoodGroup = nndb_flat$FoodGroup) # add the food group column
# PC1 versus PC2
pc1_pc2 <- ggplot(pca_scores,
aes(x = PC1, y = PC2, col = FoodGroup)) + # initialize ggplot
geom_point() + # make a scatter plot
labs(title = 'PC1 vs PC2') # add a title
ggplotly(pc1_pc2) # interactive plot
# PC1 versus PC3
pc1_pc3 <- ggplot(pca_scores,
aes(x = PC1, y = PC3, col = FoodGroup)) + # initialize ggplot
geom_point() + # make a scatter plot
labs(title = 'PC1 vs PC3') # add a title
ggplotly(pc1_pc3) # interactive plot
# PC2 versus PC3
pc2_pc3 <- ggplot(pca_scores,
aes(x = PC2, y = PC3, col = FoodGroup)) + # initialize ggplot
geom_point() + # make a scatter plot
labs(title = 'PC2 vs PC3') # add a title
ggplotly(pc2_pc3) # interactive plot
The outlier is in the Vegetables and Vegetables Products group.
# find the index of the outlier, which has PC1 greater than 30
outlier_idx <- which(pca_scores[,'PC1'] > 30)
# remove the outlier
nndb_flat_new <- nndb_flat[-outlier_idx,]
nndb_data_new <- nndb_flat_new %>%
# select only the variables from Energy_kcal to Zinc_mg
select(Energy_kcal:Zinc_mg)
# perform PCA and scale the data
pca_nndb <- prcomp(nndb_data_new, center = TRUE, scale. = TRUE)
# format proportion of the variation explained by each PC into a data frame
var <- pca_nndb$sdev^2
cum <- cumsum(var/sum(var))
# Format into a data frame
var_explained_df <- data.frame(PC = paste0("PC",1:23),
var_explained = cum)
# format the order of PCs
var_explained_df$PC <- factor(var_explained_df$PC, levels = var_explained_df$PC)
# Make a plot showing the cumulative proportion of the variation explained by each PC
var_explained_df %>%
# initialize ggplot with PC on the x-axis and cumulative variation explained on the y-axis
ggplot(aes(x = PC, y = var_explained, group = 1)) +
geom_point() + # add points to the plot
geom_line() + # add lines to the plot
labs(title = "Cumulative proportion of the variation explained", # add title
y = 'Cumulated Variance Explained', # add y-axis label
x = 'PC') + # add x-axis label
theme(axis.text.x=element_text(angle = 40, hjust = 1)) # rotate the x labels
# get loadings for the first 3 PCs which explain about 60% of the variation in the data
pca_nndb_loadings <- as.data.frame(pca_nndb$rotation) %>% # get the loadings
select(PC1, PC2, PC3) %>% # select first 3 PCs
mutate(variable = rownames(pca_nndb$rotation)) %>% # add variable names
pivot_longer(cols = c('PC1', 'PC2', 'PC3'), # transfer the data into longer format
names_to = 'PC',
values_to = 'loadings')
# make 3 separate plots for the loadings for the first 3 PCs for all of the variables
for (i in c('PC1', 'PC2', 'PC3')){
plot <- pca_nndb_loadings %>%
filter(PC == i) %>% # filter out the loadings for specific PC
ggplot(aes(x = reorder(variable, abs(loadings)), y = loadings)) + # initialize ggplot
geom_bar(stat = 'identity') + # make a bar plot
labs(title = paste("Loadings for", i), # add title
y = 'Loadings', # add y-axis label
x = 'Variable') + # add x-axis label
theme(axis.text.x = element_text(angle = 40, hjust = 1)) # rotate the x labels
# output the plot
print(plot)
}
# make 3 plots of the scores on the PCs colored by food group
# create a data frame of scores
pca_scores <- as.data.frame(pca_nndb$x) %>% # get the scores
mutate(FoodGroup = nndb_flat_new$FoodGroup) # add the food group column
# PC1 versus PC2
pc1_pc2 <- ggplot(pca_scores,
aes(x = PC1, y = PC2, col = FoodGroup)) + # initialize ggplot
geom_point() + # make a scatter plot
labs(title = 'PC1 vs PC2') # add a title
ggplotly(pc1_pc2) # interactive plot
# PC1 versus PC3
pc1_pc3 <- ggplot(pca_scores,
aes(x = PC1, y = PC3, col = FoodGroup)) + # initialize ggplot
geom_point() + # make a scatter plot
labs(title = 'PC1 vs PC3') # add a title
ggplotly(pc1_pc3) # interactive plot
# PC2 versus PC3
pc2_pc3 <- ggplot(pca_scores,
aes(x = PC2, y = PC3, col = FoodGroup)) + # initialize ggplot
geom_point() + # make a scatter plot
labs(title = 'PC2 vs PC3') # add a title
ggplotly(pc2_pc3) # interactive plot
Yes. Because we scale the data before performing PCA, so after remore the outliers the value after scaling will change. The loadings and score of PCA will change as a result.
From the plot of PC1 and PC2 loadings shows most of the loading were positive, while most loadings of PC3 were negative. So in the plot of scores, most scores of PC1 and PC2 lies in the first quadrant, in the plot of PC1 and PC3 most points lies below x-axis and left of the y-axis. In the plot of PC2 and PC3 most points lies below x-axis and right of y-axis.